Data preparations

load("XSTSF_production.RData")
source('functions.R')
f0_all_ct <- f0_all_pre %>% filter(focus_condition == 'ct' ) %>% 
  group_by(speaker) %>% 
  mutate(norm_f0 = scale(log(f0))) %>% 
  ungroup()

f0_mono <- f0_all_ct %>% filter(is.na(diortri) == TRUE)

distri_prop2(f0_mono, hist_tone1, sync_tone1)

f0_mono %>% filter(hist_tone1 == 'yangshang' & sync_tone1 == 'LH') %>% select(token, speaker) %>% distinct()
## # A tibble: 5 × 2
##   token speaker
##   <chr> <fct>  
## 1 买    S1     
## 2 买    S2     
## 3 买    S3     
## 4 买    S6     
## 5 买    S8
f0_mono %>% filter(hist_tone1 == 'yangping' & sync_tone1 == 'LH') %>% select(token, speaker) %>% distinct()
## # A tibble: 2 × 2
##   token speaker
##   <chr> <fct>  
## 1 梅    S3     
## 2 莓    S6
f0_mono %>% filter(token == '买') %>% select(token, speaker, sync_tone1) %>% distinct()
## # A tibble: 8 × 3
##   token speaker sync_tone1
##   <chr> <fct>   <chr>     
## 1 买    S1      LH        
## 2 买    S2      LH        
## 3 买    S3      LH        
## 4 买    S4      RF        
## 5 买    S5      RF        
## 6 买    S6      LH        
## 7 买    S7      RF        
## 8 买    S8      LH

Categorisation

A function to plot f0 contours

p_cluster <- function(df_cluster, x, y = NULL,  avg_line_width = 2.5){
  p_cluster <- df_cluster %>% 
    ggplot(aes(x = time, y = norm_f0, group = ind_no, color = {{x}}, 
               text = paste('speaker: ', speaker, 
                            '\ntoken_no: ', citation_no, 
                            '\ntoken: ', token, 
                            '\ntime: ', time, 
                            '\nnorm_f0: ', norm_f0))) +
    geom_line(alpha = 0.2) +
    scale_color_ptol() +
    stat_summary(fun = mean, geom = "line", lwd = avg_line_width, aes(group = {{x}}), lty = 1) +
    xlab("Normalised time") +
    ylab("z-scores of log-f0") + 
    labs(color = "tone") +
    scale_color_manual(values = c("#4477AA", "#CC6677", "#DDCC77", "#117733"))+
    theme_minimal() +
    theme(legend.position = "top",
          text = element_text(family = 'Times New Roman', size = 20),
          axis.title.x = element_text(margin = margin(t = 10)),
          axis.title.y = element_text(margin = margin(r = 20)))
  
  if (!is.null(y)) {
    p_cluster <- p_cluster + facet_wrap(as.formula(paste("~", y)), ncol = 4, labeller = label_both)
  }
  
  return(p_cluster)
}

Auditory cluster

p_cluster(f0_mono, citation_tone, avg_line_width = 4)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

k-means clustering

functions

# compare k-means and human inspection cluster results
heatmap_data <- function(df, x){
  heatmap_df <- df %>% filter(time == 1) %>% 
    group_by(citation_tone, {{x}}) %>% 
    count() %>% 
    ungroup() %>% 
    group_by(citation_tone) %>% 
    mutate(count = sum(n),
           prop = n/count) %>% 
    ungroup()

  return(heatmap_df)
}

compare_cluster <- function(df, x){
  heatmap_plot<-ggplot(df,aes_string(x=x,y='citation_tone',fill='prop'))+
    # scale_fill_continuous(breaks=c(0,0.5,1))+
    geom_tile()+xlab("cluster")+labs(fill="Frequency")+
    geom_text(aes(label = sprintf("%.2f", prop)),size=5, color = 'white')+
    scale_fill_viridis(direction = -1)+
    #scale_color_viridis(direction = -1)+
    #scale_fill_gradient(low = "yellow", high = "green", breaks = c(0, 0.5, 1))+
    theme_minimal()+
    theme(text = element_text(family = 'Times New Roman', size = 20),
          axis.title.x = element_text(margin = margin(t = 15)),
          #axis.text.x = element_text(color = c("#4477AA", "#CC6677", "#DDCC77", "#117733"), face = "bold"),
          #axis.text.y = element_text(color = c("#4477AA", "#CC6677", "#DDCC77", "#117733"), face = "bold"),
          panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
    xlab('k-means cluster')+
    ylab('perceptual cluster')
  
  return(heatmap_plot)
}
# data preparation
f0_mono_kmeans <- f0_mono %>% 
  select(-diortri, -syllable_no, -focus_no, -f0) %>% 
  spread(time, norm_f0)

# k-means clustering
cluster_model <- k_means_clustering(f0_mono_kmeans)
kml(cluster_model, nbClusters = 2:10) 
##  ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
plotAllCriterion(cluster_model)

kml::plot(cluster_model, 4, parTraj=parTRAJ(col="clusters"))

# get cluster results
f0_mono_kmeans <- f0_mono_kmeans %>% 
  mutate(cluster4 = getClusters(cluster_model, 4))

f0_mono_kmeans <- wide_to_long(f0_mono_kmeans) 

k-means cluster visualisation

p_kmeans4 <- p_cluster(f0_mono_kmeans, cluster4);p_kmeans4

heatmap distribution

heatmap_df <- heatmap_data(f0_mono_kmeans, cluster4) 
p_htmap4 <- compare_cluster(heatmap_df, 'cluster4'); p_htmap4

Visualisation of tones

The plot below shows the numbers of tokens collected for each tone.

f0_mono_count_all <- f0_mono %>% 
  group_by(citation_tone) %>% 
  count() %>% 
  mutate(n = n/10) 
f0_mono_count_all
## # A tibble: 4 × 2
## # Groups:   citation_tone [4]
##   citation_tone     n
##   <chr>         <dbl>
## 1 HH              182
## 2 HL               58
## 3 LH               39
## 4 RF              153
f0_mono %>% 
  group_by(citation_tone, speaker) %>% 
  count() %>% 
  mutate(n = n/10) %>% 
  ggplot(aes(x = citation_tone, y = n, fill = speaker))+
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=1.6, color="black",
            position = position_dodge(0.9), size=3.5)+
  scale_fill_brewer(palette="Paired")

by speaker

p_cluster(f0_mono, citation_tone, 'speaker',  avg_line_width = 1.5)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

ggplotly(p_cluster(f0_mono, citation_tone, 'speaker',  avg_line_width = 1.5), 
         tooltip = c('text', 'x'))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
f0_mono %>% filter(speaker == "S7" & citation_tone == "HL") %>% select(ind_no) %>% distinct()
## # A tibble: 6 × 1
##   ind_no  
##   <chr>   
## 1 S7_12_ct
## 2 S7_18_ct
## 3 S7_23_ct
## 4 S7_25_ct
## 5 S7_37_ct
## 6 S7_48_ct